A bipartite network of human diseases and their associated symptoms, extracted from PubMed's biomedical literature (c. 2014) using TF-IDF-weighted co-occurrences derived from the MeSH metadata field. Edges are weighted by the TF-IDF score and PubMed occurrence count, and nodes are labeled with the associated MeSH disease or symptom term.
# Installing packages
# python-louvain supplies the `community` module imported below for
# Louvain community detection (IPython shell magic; notebook-only).
!pip install python_louvain
# Importing libraries
import pandas as pd
import numpy as np
import networkx as nx
import collections
import statistics as stats
import time
from matplotlib import pyplot as plt
import seaborn as sns
import json
import matplotlib.pyplot as plt
import pandas as pd
from networkx.algorithms import bipartite
from community import community_louvain # for nxv2
# disable auto-scrolling
%%javascript
// Jupyter cell magic: stop the notebook from collapsing long outputs
// into a scrollable sub-window (useful for the large graph figures below).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
List of all 4,442 diseases within PubMed and their occurrence. (TXT 113 kb)
List of all 322 symptoms within PubMed and their occurrence. (TXT 6 kb)
Term co-occurrences between symptoms and diseases measured by TF-IDF weighted values. This table includes 147,978 records of symptom and disease relationships. (TXT 7797 kb)
List of disease links in the disease network with both significant shared symptoms and shared genes/PPIs. In total there are 133,106 such connections between 1,596 distinct diseases. (TXT 7801 kb)
# Load the four supplementary tables (diseases, symptoms, co-occurrences,
# disease-disease links) as tab-separated files.
_path_tpl = "data/41467_2014_BFncomms5212_MOESM{}_ESM.txt"
data1, data2, data3, data4 = (
    pd.read_csv(_path_tpl.format(num), delimiter="\t")
    for num in (1043, 1044, 1045, 1046)
)
# Bare expressions display each table when run in a notebook cell.
data1
data2
data3
data4
# Quick sanity-check scatters: PubMed occurrence per disease term and
# per symptom term (x-axis is categorical, so these are dense).
ax_dis = data1.plot(kind='scatter', x='MeSH Disease Term',
                    y='PubMed occurrence', color='red')
plt.show()
ax_sym = data2.plot(kind='scatter', x='MeSH Symptom Term',
                    y='PubMed occurrence', color='red')
plt.show()
# Distribution of symptom-disease co-occurrence counts, over the first
# 1,000 records of the co-occurrence table.
x = list(data3["PubMed occurrence"])[0:1000]
hist, bin_edges = np.histogram(x, bins=[i for i in range(0,100)])
# Linear-scale view of the distribution.
plt.plot(bin_edges[0:len(hist)], hist, 'o')
plt.xlabel("Symptom and disease co-occurrence")
plt.ylabel("Number of records")  # BUG FIX: label typo "Number or records"
plt.title("Symptom and disease co-occurrence distribution")
# Log-log view: a roughly straight line indicates a heavy-tailed distribution.
plt.loglog(bin_edges[0:len(hist)], hist, 'o')
plt.xlabel("Symptom and disease co-occurrence")
plt.ylabel("Number of records")  # BUG FIX: label typo "Number or records"
plt.title("Symptom and disease co-occurrence distribution")
# Keep only strong links (at least 100 PubMed co-mentions) and drop rows
# where the symptom term equals the disease term.
data_sd = data3[data3["PubMed occurrence"] >= 100]
self_pair = data_sd["MeSH Symptom Term"] == data_sd["MeSH Disease Term"]
data_sd = data_sd[~self_pair]
data_sd
# Reduce to the two columns that define a bipartite edge.
edges = data_sd.reindex(columns=["MeSH Symptom Term", "MeSH Disease Term"])
edges
# Distinct terms of each partition.
part0 = edges['MeSH Symptom Term'].unique()
len(part0)
part1 = edges['MeSH Disease Term'].unique()
len(part1)
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that also appear in ``lst2``.

    Order and duplicates of ``lst1`` are preserved, matching the original
    behavior. Membership is tested against a set built once from ``lst2``
    (O(len(lst1) + len(lst2))) instead of the original nested list scan
    (O(len(lst1) * len(lst2))); elements must therefore be hashable,
    which holds for the MeSH term strings used here.
    """
    lookup = set(lst2)
    return [value for value in lst1 if value in lookup]
# Terms that occur in BOTH partitions (as a symptom and as a disease);
# flagged as not useful, so they are removed from the edge list next.
comm = intersection(part0, part1)
len(comm)
# Remove ambiguous rows whose symptom term also occurs as a disease term.
edges = edges[~edges['MeSH Symptom Term'].isin(comm)]
# keep=False drops EVERY occurrence of a duplicated (symptom, disease) pair,
# not just the later ones.
edges_ = edges.drop_duplicates(
    subset=["MeSH Symptom Term", "MeSH Disease Term"], keep=False)
edges_
# Refresh the two partitions from the cleaned edge table.
part0 = edges_['MeSH Symptom Term'].unique()
part1 = edges_['MeSH Disease Term'].unique()
## (Symptom, Disease) pairs as plain records, ready for add_edges_from.
joins = list(edges_.to_records(index=False))
# Assemble the symptom-disease bipartite graph.
BI = nx.Graph()
BI.add_nodes_from(part0, bipartite=0)  # symptom partition
BI.add_nodes_from(part1, bipartite=1)  # disease partition
BI.add_edges_from(joins)
print(nx.info(BI))
# Restrict to the giant component so bipartite.sets() and path-based
# metrics below are well defined.
largest_component = max(nx.connected_components(BI), key=len)
BII = BI.subgraph(largest_component)
print(nx.info(BII))
## Symptoms // Diseases
# Two-column layout: one partition at x=1, the other at x=2, nodes
# stacked vertically by enumeration order.
fig = plt.figure(figsize=(30, 50))
ax = fig.add_subplot(111)
ax.axis('off')
N1, N2 = bipartite.sets(BII)
pos = {}
for i, node in enumerate(N1):
    pos[node] = (1, i)
for i, node in enumerate(N2):
    pos[node] = (2, i)
nx.draw(BII, pos=pos, with_labels=True)
plt.show()
# One-mode projections of the bipartite graph: Gph_N1 connects symptoms
# that share a disease; Gph_N2 connects diseases that share a symptom.
Gph_N1 = bipartite.projected_graph(BII, N1, multigraph=False)
Gph_N2 = bipartite.projected_graph(BII, N2, multigraph=False)
for projection in (Gph_N1, Gph_N2):
    print(nx.info(projection))
# Force-directed layout for the symptom projection, with timing.
t0 = time.time()
spring_pos = nx.spring_layout(Gph_N1)  # might take a little while
print('Time elapsed to get the graph layout: ', time.time() - t0)
# Draw the whole symptom projection with default-sized nodes.
fig = plt.figure(figsize=(40, 30))
ax = fig.add_subplot(111)
ax.axis('off')
node_size_default = 40
nx.draw_networkx(Gph_N1,
                 spring_pos,
                 ax=ax,
                 node_size=node_size_default,
                 with_labels=True)
plt.title("Entire graph - Default node size")
plt.close()
fig
# Force-directed layout for the disease projection, with timing.
t0 = time.time()
spring_pos = nx.spring_layout(Gph_N2)  # might take a little while
print('Time elapsed to get the graph layout: ', time.time() - t0)
# Draw the whole disease projection with default-sized nodes.
fig = plt.figure(figsize=(40, 30))
ax = fig.add_subplot(111)
ax.axis('off')
node_size_default = 40
nx.draw_networkx(Gph_N2,
                 spring_pos,
                 ax=ax,
                 node_size=node_size_default,
                 with_labels=True)
plt.title("Entire graph - Default node size")
plt.close()
fig
# Network metric statistics
def network_metric_statistics(metric_data):
    """Return a one-line textual summary (mean, median, stdev) of a metric.

    Parameters
    ----------
    metric_data : sequence of numbers
        e.g. a list of node degrees.

    Returns
    -------
    str
        Human-readable summary with values formatted to 5 decimal places,
        or a short notice when fewer than two values are supplied
        (the original raised statistics.StatisticsError there, since
        the sample standard deviation is undefined).
    """
    # statistics.stdev requires at least two data points.
    if len(metric_data) < 2:
        return ("Need at least two data points to summarize (got "
                + str(len(metric_data)) + ").")
    avg = stats.mean(metric_data)
    med = stats.median(metric_data)
    std = stats.stdev(metric_data)
    return ("Here is a quick summary of your data: average = "
            + '{:.5f}'.format(avg)
            + ", median = " + '{:.5f}'.format(med)
            + ", standard deviation = " + '{:.5f}'.format(std))
# Symptoms ranked by degree in the projected symptom graph.
N1s = list(N1)
N1s_degrees = [Gph_N1.degree[d] for d in N1]
ranked = sorted(zip(N1s_degrees, N1s), reverse=True)
N1s_order = [name for _, name in ranked]           # largest degree first
N1s_degrees_order = [value for value, _ in ranked]  # matching degrees
print("TOP 5 N1: \n", N1s_order[:5])
print("TOP 5 N1 DEGREES: \n", N1s_degrees_order[:5])
network_metric_statistics(N1s_degrees_order)
# Degree distribution of the symptom projection as a bar chart.
degree_count = collections.Counter(N1s_degrees_order)
plt.figure(figsize=(8, 5))
plt.bar(list(degree_count.keys()), list(degree_count.values()),
        width=1, color='b')
plt.xlabel("Node degree size", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title("Entire graph - Node degree distribution", fontsize=14)
plt.show()
# Diseases ranked by degree in the projected disease graph.
N2s = list(N2)
N2s_degrees = [Gph_N2.degree[d] for d in N2]
ranked = sorted(zip(N2s_degrees, N2s), reverse=True)
N2s_order = [name for _, name in ranked]           # largest degree first
N2s_degrees_order = [value for value, _ in ranked]  # matching degrees
print("TOP 5 N2: \n", N2s_order[:5])
print("TOP 5 N2 DEGREES: \n", N2s_degrees_order[:5])
network_metric_statistics(N2s_degrees_order)
# Degree distribution of the disease projection as a bar chart.
degree_count = collections.Counter(N2s_degrees_order)
plt.figure(figsize=(8, 5))
plt.bar(list(degree_count.keys()), list(degree_count.values()),
        width=1, color='b')
plt.xlabel("Node degree size", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title("Entire graph - Node degree distribution", fontsize=14)
plt.show()
def _top5(scores):
    # Five highest-scoring nodes of a centrality dict as (name, score) pairs.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]

# degree centrality
deg = nx.degree_centrality(Gph_N1)
_top5(deg)
# closeness centrality
closeness = nx.closeness_centrality(Gph_N1)
_top5(closeness)
# eigenvector centrality
eig = nx.eigenvector_centrality(Gph_N1)
_top5(eig)
# betweeness centrality
betw = nx.betweenness_centrality(Gph_N1)
_top5(betw)
# Same four centrality rankings, now for the disease projection.
# degree centrality
deg = nx.degree_centrality(Gph_N2)
sorted(deg.items(), key=lambda kv: kv[1], reverse=True)[:5]
# closeness centrality
closeness = nx.closeness_centrality(Gph_N2)
sorted(closeness.items(), key=lambda kv: kv[1], reverse=True)[:5]
# eigenvector centrality
eig = nx.eigenvector_centrality(Gph_N2)
sorted(eig.items(), key=lambda kv: kv[1], reverse=True)[:5]
# betweeness centrality
betw = nx.betweenness_centrality(Gph_N2)
sorted(betw.items(), key=lambda kv: kv[1], reverse=True)[:5]
# Louvain community detection on the symptom projection.
spring_pos = nx.spring_layout(Gph_N1) # might take a little while
node_size_default = 40
# partition maps each node to its (0-based) Louvain community id.
partition = community_louvain.best_partition(Gph_N1)
communities = [partition.get(node) for node in Gph_N1.nodes()]
# BUG FIX: community ids are 0-based, so the count is max+1 — the
# original printed max(communities) here while the N2 cell below
# already used max+1; this makes the two cells consistent and correct.
num_communities = max(communities) + 1
print('The number of communities is ' + str(num_communities) + '.')
# Let's assign each node to its given community
nx.set_node_attributes(Gph_N1, partition, name='community')
colors = [Gph_N1.nodes[n]['community'] for n in Gph_N1.nodes]
fig = plt.figure(figsize = (15, 15))
ax = fig.add_subplot(111)
ax.axis('off')
# Color nodes by community id.
n = nx.draw_networkx(Gph_N1,
                     spring_pos,
                     ax = ax,
                     node_size = node_size_default,
                     with_labels = False,
                     node_color = communities)
plt.close();
fig
# Louvain community detection on the disease projection.
spring_pos = nx.spring_layout(Gph_N2)  # might take a little while
node_size_default = 40
partition = community_louvain.best_partition(Gph_N2)
communities = [partition.get(node) for node in Gph_N2.nodes()]
# Community ids are 0-based, hence the +1 for the count.
num_communities = max(communities) + 1
print('The number of communities is ' + str(num_communities))
# Store each node's community id as a node attribute and reuse it for coloring.
nx.set_node_attributes(Gph_N2, partition, name='community')
colors = [Gph_N2.nodes[n]['community'] for n in Gph_N2.nodes]
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
ax.axis('off')
nx.draw_networkx(Gph_N2,
                 spring_pos,
                 ax=ax,
                 node_size=node_size_default,
                 with_labels=False,
                 node_color=communities)
plt.close()
fig
# Invert the partition: community id -> list of member (disease) nodes.
res = {}
for node, comm_id in partition.items():
    # setdefault + append replaces the original
    # `res[v] = [i] if v not in res.keys() else res[v] + [i]`,
    # which rebuilt the whole list on every insertion (quadratic) and
    # used the `in d.keys()` anti-idiom. Membership order is unchanged.
    res.setdefault(comm_id, []).append(node)
print(res)
print(res[0])
def process_one_community(community_val):
    """Build and inspect the symptom-disease sub-bigraph of one community.

    Parameters
    ----------
    community_val : list
        MeSH disease terms belonging to one Louvain community
        (one value of the module-level ``res`` mapping).

    Returns
    -------
    tuple (syms_sorted_top, percent, name_community)
        syms_sorted_top: symptoms whose degree in the sub-bigraph is >= 3,
        or ["none"] when the community has <= 3 distinct symptoms;
        percent: share (%) of the community's edge rows covered by those
        top symptoms (0 for skipped communities);
        name_community: hand-assigned label, "none" if no marker disease
        was matched.

    Side effects: draws the community's bipartite graph and prints
    diagnostics. Relies on the module-level ``edges_`` DataFrame.
    """
    # Rows of the global edge table whose disease is in this community.
    edges_tmp = edges_[edges_['MeSH Disease Term'].isin(community_val)]
    # Defaults returned for communities that are too small to analyze.
    syms_sorted_top = ["none"]
    percent = 0
    name_community = "none"
    # Only analyze communities with more than 3 distinct symptom nodes.
    if len(edges_tmp['MeSH Symptom Term'].unique())>3:
        # Both parts of the bipartite subgraph.
        part0_tmp = edges_tmp['MeSH Symptom Term'].unique()
        part1_tmp = edges_tmp['MeSH Disease Term'].unique()
        # (Symptom, Disease) edge records.
        joins_tmp = list(edges_tmp.to_records(index=False))
        # Build the bipartite subgraph.
        BI_tmp = nx.Graph()
        BI_tmp.add_nodes_from(part0_tmp, bipartite=0)
        BI_tmp.add_nodes_from(part1_tmp, bipartite=1)
        BI_tmp.add_edges_from(joins_tmp)
        # Information
        #print(nx.info(BI_tmp))
        # Taking the largest connected component
        #components = sorted(nx.connected_components(BI_tmp), key=len, reverse=True)
        #largest_component = components[0]
        #BII_tmp = BI_tmp.subgraph(largest_component)
        # Information
        #print(nx.info(BII_tmp))
        # Two-column drawing: symptoms at x=1, diseases at x=2.
        fig = plt.figure(figsize = (12, 12))
        ax = fig.add_subplot(111)
        ax.axis('off')
        # NOTE(review): bipartite.sets() infers the split by two-coloring
        # and presumably requires BI_tmp to be connected — confirm; a
        # disconnected community subgraph could raise here.
        N1, N2 = bipartite.sets(BI_tmp)
        pos = dict()
        pos.update( (n, (1, i)) for i, n in enumerate(N1) ) # put nodes from N1
        pos.update( (n, (2, i)) for i, n in enumerate(N2) ) # put nodes from N2
        nx.draw(BI_tmp, pos=pos, with_labels=True)
        plt.show()
        # degY holds the degrees of the node set passed in (the symptoms).
        degX, degY = bipartite.degrees(BI_tmp, N1)
        degY = dict(degY)
        # Symptoms sorted by degree, highest first.
        syms_sorted = dict(sorted(degY.items(), key=lambda item: item[1], reverse=True))
        # Print the sorted symptoms.
        for key, value in syms_sorted.items():
            print(key, ': ', value)
        # Hand-curated community labels, triggered by marker diseases.
        # If several markers are present, the last match wins.
        for diss in list(N2):
            if diss =="Cognition Disorders":
                name_community = "Mental disorders"
            if diss =="Bacterial Infections":
                name_community = "Bacterial infections"
            if diss =="Deficiency Diseases":
                name_community = "Malnutrition deficiency"
            if diss =="Lung Diseases":
                name_community = "Lung or heart problems"
            if diss =="Corneal Diseases":
                name_community = "Vision or Pregnancy problems"
        print("\nName community: ",name_community)
        # Keep only the top symptoms with degree >= 3.
        syms_sorted_top = [k for k,v in syms_sorted.items() if float(v) >= 3]
        # Edge rows touched by those top symptoms.
        edges_tmp2 = edges_tmp[edges_tmp['MeSH Symptom Term'].isin(syms_sorted_top)]
        # NOTE(review): this counts ROWS (diseases repeated per symptom),
        # not unique diseases, despite the printed message — confirm intent.
        partial_diseas_ = len(edges_tmp2['MeSH Disease Term'])
        # only for verfication
        print(edges_tmp2['MeSH Disease Term'].unique())
        # Row count for the whole community (same repeat caveat as above).
        all_diseas_ = len(edges_tmp['MeSH Disease Term'])
        percent = round(partial_diseas_*100/all_diseas_,2)
        print("\nTop symptoms:", syms_sorted_top)
        print("\nTop symptoms detects the", str(percent), "%", "of the diseases in the community")
    else:
        print("\nCommunity too small")
    return syms_sorted_top, percent, name_community
# Summarize every Louvain community; keep only those large enough for
# process_one_community to score (it returns ["none"] otherwise).
name_comm_all = []
tops_syms_all = []
percs_syms_all = []
for idx in range(num_communities):
    print("\n\n\n####################### ####################### ####################### #######################")
    print(idx)
    tops_syms, percs_syms, name_comm = process_one_community(res[idx])
    if tops_syms != ["none"]:
        name_comm_all.append(name_comm)
        tops_syms_all.append(tops_syms)
        percs_syms_all.append(percs_syms)
# Show all the communities names
print(name_comm_all)
# List of all the symptoms
print(tops_syms_all)
# Coverage statistics across the retained communities.
avg = stats.mean(percs_syms_all)
med = stats.median(percs_syms_all)
std = stats.stdev(percs_syms_all)
print("Percentages:", percs_syms_all)
print("Avg:", round(avg, 2))
print("Std:", round(std, 2))
print("Med:", round(med, 2))
# One row per community: its top symptoms, label, and coverage.
df = pd.DataFrame({'Symptoms': tops_syms_all,
                   'Disease Category': name_comm_all,
                   'Percentage': percs_syms_all})
df
def pred_categ_disease(symtoms):
    """Print the predicted disease category for each given symptom.

    Parameters
    ----------
    symtoms : iterable of str
        MeSH symptom terms to look up (parameter name kept as-is for
        backward compatibility with existing callers).

    Side effects: for every community whose top-symptom list contains a
    given symptom, prints the community's label and coverage percentage.
    Prints nothing for symptoms with no match. Relies on the module-level
    ``df`` built above.
    """
    for symt in symtoms:
        # Iterate the rows directly instead of the original
        # `for idx in range(0, len(df)) ... df[col][idx]` positional
        # indexing, which silently depends on df keeping its default
        # RangeIndex.
        for top_syms, categ, pct in zip(df["Symptoms"],
                                        df["Disease Category"],
                                        df["Percentage"]):
            if symt in top_syms:
                print("Predicted disease: " + categ)
                print("Top symptoms detects the " + str(pct) + " % of the diseases in the community")
# Try the category predictor on a few example symptoms, one at a time.
for example_symptom in ("Weight Loss", "Nausea", "Dyspepsia", "Chest Pain"):
    pred_categ_disease([example_symptom])